Money ball data is used through out the lecture series.
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(dslabs)
library(Lahman)
str(Teams)
'data.frame': 2835 obs. of 48 variables:
$ yearID : int 1871 1871 1871 1871 1871 1871 1871 1871 1871 1872 ...
$ lgID : Factor w/ 7 levels "AA","AL","FL",..: 4 4 4 4 4 4 4 4 4 4 ...
$ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 24 31 39 56 90 97 111 136 142 8 ...
$ franchID : Factor w/ 120 levels "ALT","ANA","ARI",..: 13 36 25 56 70 85 91 109 77 9 ...
$ divID : chr NA NA NA NA ...
$ Rank : int 3 2 8 7 5 1 9 6 4 2 ...
$ G : int 31 28 29 19 33 28 25 29 32 58 ...
$ Ghome : int NA NA NA NA NA NA NA NA NA NA ...
$ W : int 20 19 10 7 16 21 4 13 15 35 ...
$ L : int 10 9 19 12 17 7 21 15 15 19 ...
$ DivWin : chr NA NA NA NA ...
$ WCWin : chr NA NA NA NA ...
$ LgWin : chr "N" "N" "N" "N" ...
$ WSWin : chr NA NA NA NA ...
$ R : int 401 302 249 137 302 376 231 351 310 617 ...
$ AB : int 1372 1196 1186 746 1404 1281 1036 1248 1353 2576 ...
$ H : int 426 323 328 178 403 410 274 384 375 747 ...
$ X2B : int 70 52 35 19 43 66 44 51 54 94 ...
$ X3B : int 37 21 40 8 21 27 25 34 26 35 ...
$ HR : int 3 10 7 2 1 9 3 6 6 14 ...
$ BB : int 60 60 26 33 33 46 38 49 48 27 ...
$ SO : int 19 22 25 9 15 23 30 19 13 28 ...
$ SB : int 73 69 18 16 46 56 53 62 48 35 ...
$ CS : int NA NA NA NA NA NA NA NA NA 15 ...
$ HBP : int NA NA NA NA NA NA NA NA NA NA ...
$ SF : int NA NA NA NA NA NA NA NA NA NA ...
$ RA : int 303 241 341 243 313 266 287 362 303 434 ...
$ ER : int 109 77 116 97 121 137 108 153 137 173 ...
$ ERA : num 3.55 2.76 4.11 5.17 3.72 4.95 4.3 5.51 4.37 3.02 ...
$ CG : int 22 25 23 19 32 27 23 28 32 48 ...
$ SHO : int 1 0 0 1 1 0 1 0 0 1 ...
$ SV : int 3 1 0 0 0 0 0 0 0 1 ...
$ IPouts : int 828 753 762 507 879 747 678 750 846 1545 ...
$ HA : int 367 308 346 261 373 329 315 431 371 566 ...
$ HRA : int 2 6 13 5 7 3 3 4 4 3 ...
$ BBA : int 42 28 53 21 42 53 34 75 45 63 ...
$ SOA : int 23 22 34 17 22 16 16 12 13 0 ...
$ E : int 225 218 223 163 227 194 220 198 217 432 ...
$ DP : int NA NA NA NA NA NA NA NA NA NA ...
$ FP : num 0.838 0.829 0.814 0.803 0.839 0.845 0.821 0.845 0.85 0.829 ...
$ name : chr "Boston Red Stockings" "Chicago White Stockings" "Cleveland Forest Citys" "Fort Wayne Kekiongas" ...
$ park : chr "South End Grounds I" "Union Base-Ball Grounds" "National Association Grounds" "Hamilton Field" ...
$ attendance : int NA NA NA NA NA NA NA NA NA NA ...
$ BPF : int 103 104 96 101 90 102 97 101 94 106 ...
$ PPF : int 98 102 100 107 88 98 99 100 98 102 ...
$ teamIDBR : chr "BOS" "CHI" "CLE" "KEK" ...
$ teamIDlahman45: chr "BS1" "CH1" "CL1" "FW1" ...
$ teamIDretro : chr "BS1" "CH1" "CL1" "FW1" ...
head(Teams)
Lets analyze a few variables against Runs per game -> Home runs per game -> Stolen Bases per game -> Base on Balls per game
Lets compute these variables
T2002 <- Teams %>% filter(yearID %in% 2002) %>%
mutate (
singles = H - HR - X2B - X3B,
R_per_game = R/G,
HR_per_game = HR/G,
SB_per_game = SB/G,
BB_per_game = BB/G,
AB_per_game = AB/G,
singles_per_game = singles / G,
doubles_per_game = X2B / G,
triples_per_game = X3B / G
)
Teams <- Teams %>% filter(yearID %in% 1961:2001) %>%
mutate (
singles = H - HR - X2B - X3B,
R_per_game = R/G,
HR_per_game = HR/G,
SB_per_game = SB/G,
BB_per_game = BB/G,
AB_per_game = AB/G,
singles_per_game = singles / G,
doubles_per_game = X2B / G,
triples_per_game = X3B / G
)
str(Teams)
'data.frame': 1026 obs. of 57 variables:
$ yearID : int 1961 1961 1961 1961 1961 1961 1961 1961 1961 1961 ...
$ lgID : Factor w/ 7 levels "AA","AL","FL",..: 2 2 2 5 5 2 2 2 2 5 ...
$ teamID : Factor w/ 149 levels "ALT","ANA","ARI",..: 5 16 33 35 38 45 52 64 71 72 ...
$ franchID : Factor w/ 120 levels "ALT","ANA","ARI",..: 6 14 29 26 30 32 41 76 2 57 ...
$ divID : chr NA NA NA NA ...
$ Rank : int 3 6 4 7 1 5 2 9 8 2 ...
$ G : int 163 163 163 156 154 161 163 162 162 154 ...
$ Ghome : int 82 82 81 78 77 81 82 80 82 77 ...
$ W : int 95 76 86 64 93 78 101 61 70 89 ...
$ L : int 67 86 76 90 61 83 61 100 91 65 ...
$ DivWin : chr NA NA NA NA ...
$ WCWin : chr NA NA NA NA ...
$ LgWin : chr "N" "N" "N" "N" ...
$ WSWin : chr "N" "N" "N" "N" ...
$ R : int 691 729 765 689 710 737 841 683 744 735 ...
$ AB : int 5481 5508 5556 5344 5243 5609 5561 5423 5424 5189 ...
$ H : int 1393 1401 1475 1364 1414 1493 1481 1342 1331 1358 ...
$ X2B : int 227 251 216 238 247 257 215 216 218 193 ...
$ X3B : int 36 37 46 51 35 39 53 47 22 40 ...
$ HR : int 149 112 138 176 158 150 180 90 189 157 ...
$ BB : int 581 647 550 539 423 492 673 580 681 596 ...
$ SO : int 902 847 612 1027 761 720 867 772 1068 796 ...
$ SB : int 39 56 100 35 70 34 98 58 37 86 ...
$ CS : int 30 36 40 25 33 11 36 22 28 45 ...
$ HBP : int NA NA NA NA NA NA NA NA NA NA ...
$ SF : int NA NA NA NA NA NA NA NA NA NA ...
$ RA : int 588 792 726 800 653 752 671 863 784 697 ...
$ ER : int 526 687 653 689 575 665 575 745 689 619 ...
$ ERA : num 3.22 4.29 4.06 4.48 3.78 4.15 3.55 4.74 4.31 4.04 ...
$ CG : int 54 35 39 34 46 35 62 32 25 40 ...
$ SHO : int 21 6 3 6 12 12 12 5 5 10 ...
$ SV : int 33 30 33 25 40 23 30 23 34 35 ...
$ IPouts : int 4413 4326 4344 4155 4110 4329 4377 4245 4314 4134 ...
$ HA : int 1226 1472 1491 1492 1300 1426 1404 1519 1391 1346 ...
$ HRA : int 109 167 158 165 147 178 170 141 180 167 ...
$ BBA : int 617 679 498 465 500 599 469 629 713 544 ...
$ SOA : int 926 831 814 755 829 801 836 703 973 1105 ...
$ E : int 126 143 128 183 134 139 146 174 192 136 ...
$ DP : int 173 140 138 175 124 142 147 160 154 162 ...
$ FP : num 0.98 0.977 0.98 0.97 0.977 0.977 0.976 0.972 0.969 0.976 ...
$ name : chr "Baltimore Orioles" "Boston Red Sox" "Chicago White Sox" "Chicago Cubs" ...
$ park : chr "Memorial Stadium" "Fenway Park II" "Comiskey Park" "Wrigley Field" ...
$ attendance : int 951089 850589 1146019 673057 1117603 725547 1600710 683817 603510 1804250 ...
$ BPF : int 96 102 99 101 102 97 103 101 111 108 ...
$ PPF : int 96 103 97 104 101 98 102 103 112 107 ...
$ teamIDBR : chr "BAL" "BOS" "CHW" "CHC" ...
$ teamIDlahman45 : chr "BAL" "BOS" "CHA" "CHN" ...
$ teamIDretro : chr "BAL" "BOS" "CHA" "CHN" ...
$ singles : int 981 1001 1075 899 974 1047 1033 989 902 968 ...
$ R_per_game : num 4.24 4.47 4.69 4.42 4.61 ...
$ HR_per_game : num 0.914 0.687 0.847 1.128 1.026 ...
$ SB_per_game : num 0.239 0.344 0.613 0.224 0.455 ...
$ BB_per_game : num 3.56 3.97 3.37 3.46 2.75 ...
$ AB_per_game : num 33.6 33.8 34.1 34.3 34 ...
$ singles_per_game: num 6.02 6.14 6.6 5.76 6.32 ...
$ doubles_per_game: num 1.39 1.54 1.33 1.53 1.6 ...
$ triples_per_game: num 0.221 0.227 0.282 0.327 0.227 ...
Teams
ds_theme_set()
Teams %>% ggplot(aes(HR_per_game, R_per_game)) +
geom_point(alpha = 0.5) +
# geom_abline()
stat_smooth(method = "lm") # Adds a
Teams %>% ggplot(aes(SB_per_game, R_per_game)) +
geom_point(alpha = 0.5) +
# geom_abline()
stat_smooth(method = "lm") # Adds a
Teams %>% ggplot(aes(BB_per_game, R_per_game)) +
geom_point(alpha = 0.5) +
# geom_abline()
stat_smooth(method = "lm") # Adds a
Teams %>% ggplot(aes(AB_per_game, R_per_game)) +
geom_point(alpha = 0.5) +
# geom_abline()
stat_smooth(method = "lm") # Adds a
However, this is not true if we understand the BaseBall game bette. Homeruns usually results in BB and this is why BB is higher.
Lets check the correlation between BB & Home runs. Also compare Singles with Home runs.
Teams %>% summarize(
cor(HR_per_game, BB_per_game),
cor(HR_per_game, singles_per_game),
cor(BB_per_game, singles_per_game)
)
Bases on Balls are confounded with Home Runs
fit <- lm(R_per_game ~ HR_per_game + BB_per_game, Teams)
fit
Call:
lm(formula = R_per_game ~ HR_per_game + BB_per_game, data = Teams)
Coefficients:
(Intercept) HR_per_game BB_per_game
1.7444 1.5611 0.3874
summary(Teams)
yearID lgID teamID franchID divID Rank G Ghome
Min. :1961 AA: 0 BAL : 41 ANA : 41 Length:1026 Min. : 1.000 Min. :103.0 Min. :44.00
1st Qu.:1973 AL:526 BOS : 41 ATL : 41 Class :character 1st Qu.: 2.000 1st Qu.:161.0 1st Qu.:81.00
Median :1983 FL: 0 CHA : 41 BAL : 41 Mode :character Median : 4.000 Median :162.0 Median :81.00
Mean :1982 NA: 0 CHN : 41 BOS : 41 Mean : 3.762 Mean :158.5 Mean :79.26
3rd Qu.:1993 NL:500 CIN : 41 CHC : 41 3rd Qu.: 5.000 3rd Qu.:162.0 3rd Qu.:81.00
Max. :2001 PL: 0 CLE : 41 CHW : 41 Max. :10.000 Max. :165.0 Max. :84.00
UA: 0 (Other):780 (Other):780
W L DivWin WCWin LgWin WSWin
Min. : 37.00 Min. : 40.00 Length:1026 Length:1026 Length:1026 Length:1026
1st Qu.: 71.00 1st Qu.: 71.00 Class :character Class :character Class :character Class :character
Median : 80.00 Median : 79.00 Mode :character Mode :character Mode :character Mode :character
Mean : 79.18 Mean : 79.18
3rd Qu.: 88.00 3rd Qu.: 88.00
Max. :116.00 Max. :120.00
R AB H X2B X3B HR BB
Min. : 329.0 Min. :3493 Min. : 797 Min. :119.0 Min. :11.00 Min. : 32.0 Min. :275.0
1st Qu.: 628.2 1st Qu.:5423 1st Qu.:1343 1st Qu.:210.2 1st Qu.:28.00 1st Qu.:108.0 1st Qu.:472.2
Median : 689.5 Median :5498 Median :1408 Median :238.0 Median :34.00 Median :132.0 Median :520.0
Mean : 690.0 Mean :5398 Mean :1395 Mean :239.2 Mean :35.08 Mean :135.5 Mean :522.2
3rd Qu.: 755.0 3rd Qu.:5564 3rd Qu.:1475 3rd Qu.:268.0 3rd Qu.:41.00 3rd Qu.:161.0 3rd Qu.:573.0
Max. :1009.0 Max. :5781 Max. :1684 Max. :373.0 Max. :79.00 Max. :264.0 Max. :775.0
SO SB CS HBP SF RA ER
Min. : 379.0 Min. : 17 Min. : 11.00 Min. :29.00 Min. :25.00 Min. : 331.0 Min. : 293.0
1st Qu.: 816.0 1st Qu.: 70 1st Qu.: 41.00 1st Qu.:48.00 1st Qu.:43.00 1st Qu.: 626.0 1st Qu.: 553.2
Median : 903.5 Median : 99 Median : 50.50 Median :57.00 Median :49.00 Median : 688.0 Median : 615.0
Mean : 897.5 Mean :103 Mean : 52.04 Mean :57.72 Mean :48.97 Mean : 690.0 Mean : 617.3
3rd Qu.: 986.8 3rd Qu.:131 3rd Qu.: 61.00 3rd Qu.:65.25 3rd Qu.:54.00 3rd Qu.: 754.8 3rd Qu.: 679.0
Max. :1399.0 Max. :341 Max. :123.00 Max. :89.00 Max. :75.00 Max. :1103.0 Max. :1015.0
NA's :966 NA's :966
ERA CG SHO SV IPouts HA HRA BBA
Min. :2.450 Min. : 1.00 Min. : 0.00 Min. :10.0 Min. :2767 Min. : 827 Min. : 40.0 Min. :268.0
1st Qu.:3.493 1st Qu.:13.00 1st Qu.: 7.00 1st Qu.:29.0 1st Qu.:4299 1st Qu.:1340 1st Qu.:112.0 1st Qu.:475.0
Median :3.855 Median :25.00 Median :10.00 Median :35.0 Median :4341 Median :1408 Median :131.0 Median :521.0
Mean :3.921 Mean :26.72 Mean :10.02 Mean :35.2 Mean :4258 Mean :1395 Mean :135.5 Mean :522.2
3rd Qu.:4.290 3rd Qu.:39.00 3rd Qu.:13.00 3rd Qu.:42.0 3rd Qu.:4377 3rd Qu.:1476 3rd Qu.:158.0 3rd Qu.:572.0
Max. :6.380 Max. :94.00 Max. :30.00 Max. :68.0 Max. :4518 Max. :1734 Max. :241.0 Max. :784.0
SOA E DP FP name park attendance
Min. : 388.0 Min. : 57.0 Min. : 74 Min. :0.9670 Length:1026 Length:1026 Min. : 306763
1st Qu.: 807.2 1st Qu.:113.0 1st Qu.:135 1st Qu.:0.9770 Class :character Class :character 1st Qu.:1097446
Median : 899.0 Median :129.0 Median :148 Median :0.9790 Mode :character Mode :character Median :1589698
Mean : 897.5 Mean :128.7 Mean :148 Mean :0.9789 Mean :1694421
3rd Qu.: 994.0 3rd Qu.:143.0 3rd Qu.:162 3rd Qu.:0.9810 3rd Qu.:2174151
Max. :1344.0 Max. :210.0 Max. :215 Max. :0.9890 Max. :4483350
BPF PPF teamIDBR teamIDlahman45 teamIDretro singles R_per_game
Min. : 90.0 Min. : 90.0 Length:1026 Length:1026 Length:1026 Min. : 576.0 Min. :2.858
1st Qu.: 97.0 1st Qu.: 97.0 Class :character Class :character Class :character 1st Qu.: 946.2 1st Qu.:3.963
Median :100.0 Median :100.0 Mode :character Mode :character Mode :character Median : 993.0 Median :4.326
Mean :100.2 Mean :100.2 Mean : 984.9 Mean :4.355
3rd Qu.:103.0 3rd Qu.:103.0 3rd Qu.:1042.0 3rd Qu.:4.734
Max. :129.0 Max. :129.0 Max. :1239.0 Max. :6.228
HR_per_game SB_per_game BB_per_game AB_per_game singles_per_game doubles_per_game triples_per_game
Min. :0.2909 Min. :0.1090 Min. :2.130 Min. :32.20 Min. :5.006 Min. :0.9264 Min. :0.0679
1st Qu.:0.6759 1st Qu.:0.4419 1st Qu.:2.995 1st Qu.:33.71 1st Qu.:5.950 1st Qu.:1.3272 1st Qu.:0.1728
Median :0.8328 Median :0.6230 Median :3.263 Median :34.05 Median :6.191 Median :1.4952 Median :0.2160
Mean :0.8547 Mean :0.6510 Mean :3.295 Mean :34.05 Mean :6.215 Mean :1.5109 Mean :0.2213
3rd Qu.:1.0062 3rd Qu.:0.8210 3rd Qu.:3.589 3rd Qu.:34.42 3rd Qu.:6.475 3rd Qu.:1.6790 3rd Qu.:0.2593
Max. :1.6296 Max. :2.1180 Max. :4.784 Max. :35.69 Max. :7.601 Max. :2.3025 Max. :0.4877
Tibbles display much better
Teams
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mtibble [30m 2.0.1 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mtibble [30m 2.0.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
package ‘tibble’ was built under R version 3.5.2[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(tidyquant)
Loading required package: lubridate
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:dplyr’:
first, last
Package PerformanceAnalytics (1.5.2) loaded.
Copyright (c) 2004-2018 Peter Carl and Brian G. Peterson, GPL-2 | GPL-3
https://github.com/braverock/PerformanceAnalytics
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
legend
Loading required package: quantmod
Loading required package: TTR
Version 0.4-0 included new data defaults. See ?getSymbols.
Learn from a quantmod author: https://www.datacamp.com/courses/importing-and-managing-financial-data-in-r
t <- as_tibble(Teams)
t
Create a regression line for each strata
dat <- Teams %>% filter(yearID %in% 1961:2001) %>%
mutate(HR = round(HR/G, 1),
BB = BB / G,
R = R / G) %>%
select(HR, BB, R) %>%
filter(HR >= 0.4 & HR <= 1.2)
dat %>% group_by(HR) %>% do(fit = lm(R ~ BB, data = . ) )
Create a function that returns a dataframe instread of a function
get_slope <- function(data) {
fit <- lm(R ~ BB, data = data)
sum.fit <- summary(fit)
data.frame(slope = sum.fit$coefficients[2, "Estimate"],
se = sum.fit$coefficients[2, "Std. Error"],
pvalue = sum.fit$coefficients[2, "Pr(>|t|)"])
}
dat %>% group_by(HR) %>% do(get_slope(.))
To know whether the relationship between home runs and runs per game varies by baseball league.
library(broom)
dat <- Teams %>% select(lgID, HR_per_game, BB_per_game, R_per_game)
dat %>%
group_by(lgID) %>%
do(glance(lm(R_per_game ~ HR_per_game, data = .)))
fit <- Teams %>% lm(R_per_game ~ HR_per_game + BB_per_game, data = .)
tidy(fit)
tidy(fit, conf.int = TRUE)
fit <- Teams %>% lm(R_per_game ~ BB_per_game + singles_per_game + doubles_per_game + triples_per_game + HR_per_game, data = .)
tidy(fit, conf.int = TRUE)
Lets predit for 2002 based on the above model
t %>% ggplot(aes(R_pred, R_per_game)) +
geom_point() +
geom_text(aes(label = teamID), nudge_x = 0.06) +
# stat_smooth(method = "lm") +
geom_abline(slope = 1, intercept = 0)
Lets look at batting dataset
pa_per_game
[1] 38.58408
Imagine you have two teams. Team A is comprised of batters who, on average, get two bases on balls, four singles, one double, and one home run. Team B is comprised of batters who, on average, get one base on balls, six singles, two doubles, and one triple.
predict(fit, newdata = nd)
1 2
2.265375 3.500614
To be sure what data corresponds to which column, we can mutate the column into dataframe.